In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline
import os
import pylab
import numpy as np
import umap
import glob
In [2]:
# Global seaborn theme for every figure in this notebook: 'paper' context with a
# doubled font scale, dark-grid axes and a white figure background.
sns.set(
    context='paper',
    style='darkgrid',
    font_scale=2,
    rc={'figure.facecolor': 'white'},
)
In [95]:
# Work from the folder that holds the per-sample CSV exports so the relative file
# names collected here can be passed straight to pd.read_csv.
os.chdir('/Users/s1249052/PhD/flow data/vac69a/t cells only/experiment_210618_files/csv_by_person/all')

# glob returns matches in whatever order the file system yields them, so sort
# explicitly: the row order of the combined table then no longer depends on the
# machine. The case-insensitive key keeps 'baseline_*' ahead of 'C+10_*', which is
# the order shown in the listing recorded from the original run.
filenames = sorted(glob.glob('*.csv'), key=str.lower)
print(filenames, len(filenames))
['baseline_02_UMAP.csv', 'baseline_03_UMAP.csv', 'baseline_05_UMAP.csv', 'baseline_06_UMAP.csv', 'baseline_07_UMAP.csv', 'baseline_09_UMAP.csv', 'C+10_02_UMAP.csv', 'C+10_03_UMAP.csv', 'C+10_05_UMAP.csv', 'C+10_06_UMAP.csv', 'C+10_07_UMAP.csv', 'C+10_09_UMAP.csv', 'C+12_02_UMAP.csv', 'C+12_03_UMAP.csv', 'C+12_05_UMAP.csv', 'C+12_06_UMAP.csv', 'C+12_07_UMAP.csv', 'C+12_09_UMAP.csv', 'DoD_02_UMAP.csv', 'DoD_03_UMAP.csv', 'DoD_05_UMAP.csv', 'DoD_06_UMAP.csv', 'DoD_07_UMAP.csv', 'DoD_09_UMAP.csv', 'T+6_02_UMAP.csv', 'T+6_03_UMAP.csv', 'T+6_05_UMAP.csv', 'T+6_06_UMAP.csv', 'T+6_07_UMAP.csv', 'T+6_09_UMAP.csv'] 30
In [96]:
# Read every CSV in `filenames`, tag its rows with the source file name in an 'id'
# column, and stack everything into one table (`big_boi`) with a fresh 0..N-1 index.
#
# The frames are collected in a list and concatenated once: growing a DataFrame
# with .append inside the loop re-copies all accumulated rows on every iteration,
# and DataFrame.append no longer exists in pandas >= 2.0.
frames = []
for csv_name in filenames:
    sample = pd.read_csv(csv_name)
    sample['id'] = csv_name  # remember which file each row came from
    frames.append(sample)

# pd.concat raises on an empty list, so fall back to an empty frame (what the
# original loop produced) when no CSV files were found.
big_boi = pd.concat(frames, ignore_index=True, sort=False) if frames else pd.DataFrame()

print(len(big_boi))
print(len(big_boi.columns))
977807
39
In [97]:
# Peek at the first two rows of the columns at positions 1-34 (the slice 1:35),
# skipping the leading column at position 0.
print(big_boi.iloc[:2, 1:35])
   115In_CD57  141Pr_HLA-DR  142Nd_BCL-2  143Nd_CD45RA  144Nd_GZB  145Nd_CD4  \
0         0.0           0.0     4.687225      3.931365   0.000000    0.00000   
1         0.0           0.0     3.584899      3.505189   0.896894    5.45049   

   146Nd_Vd2  148Nd_ICOS  149Sm_CXCR5  150Nd_CD95     ...      167Er_CCR7  \
0   0.000000    0.000000          0.0     0.00000     ...        5.220753   
1   2.468887    1.956988          0.0     0.90938     ...        4.308445   

   168Er_CD127  169Tm_CD38  171Yb_CD49d  172Yb_CD25  173Yb_CD39  174Yb_CLA  \
0     4.078405    3.126917     3.486562    1.887629    1.316506   2.603347   
1     3.855416    5.087446     2.033088    0.870406    0.000000   0.279724   

   175Lu_Perforin  198Pt_CD8  209Bi_CD16  
0        1.417754   4.724484    0.000000  
1        0.000000   0.697346    0.464288  

[2 rows x 34 columns]
In [98]:
#######great
reducer = umap.UMAP(n_neighbors=20, min_dist=0.1, n_components=2, metric='euclidean')
%time embedding = reducer.fit_transform(big_boi.iloc[:, 1:35])
embedding.shape 
big_boi['umap1'] = embedding[:, 0]
big_boi['umap2'] = embedding[:,1]

channels=list(big_boi.columns.values)

for n in channels[0:35]:
    cmap=plt.get_cmap('nipy_spectral')
    fig=plt.figure(figsize=(15,10))
    ax=plt.subplot()
    ax.scatter(big_boi['umap1'], big_boi['umap2'], c=big_boi[n], cmap=cmap, s=1)
    plt.title('{}'.format(n))
    #plt.savefig('{}.pdf'.format(n))
    plt.show()
CPU times: user 1h 46min 2s, sys: 2min 52s, total: 1h 48min 54s
Wall time: 57min 6s
In [63]:
# tried but didn't look nice:

#(n_neighbors=15, min_dist=0.1, n_components=2, metric='euclidean')
#(n_neighbors=15, min_dist=0.2, n_components=2, metric='euclidean')
#(n_neighbors=20, min_dist=0.2, n_components=2, metric='euclidean')

# maybe try increasing n_neighbors iteratively? it makes sense that it would have to go up,
# given the much higher density of features conserved between people & timepoints

# ask Irina about an algorithm that finds good hyperparameters
In [99]:
#print(np.unique(big_boi['id'])[0])
#maybs = big_boi.loc[big_boi['id'] == 'C+10_02_UMAP.csv']
#maybs.head(n=20)
C+10_02_UMAP.csv
Out[99]:
Unnamed: 0 115In_CD57 141Pr_HLA-DR 142Nd_BCL-2 143Nd_CD45RA 144Nd_GZB 145Nd_CD4 146Nd_Vd2 148Nd_ICOS 149Sm_CXCR5 ... 172Yb_CD25 173Yb_CD39 174Yb_CLA 175Lu_Perforin 198Pt_CD8 209Bi_CD16 Timepoint umap1 umap2 id
228984 41602 0.000000 0.000000 3.935311 2.713579 2.294961 5.226323 2.402305 0.556648 0.000000 ... 0.877445 0.827879 0.041487 1.215187 1.636192 0.861018 C+10 -5.126123 -7.202410 C+10_02_UMAP.csv
228985 41603 0.000000 0.000000 3.928290 1.653279 0.000000 5.214157 1.598988 0.000000 1.749907 ... 0.701401 0.000000 1.406371 0.000000 1.574973 2.632217 C+10 -3.886959 -6.829418 C+10_02_UMAP.csv
228986 41604 0.000000 0.000000 3.280547 3.937355 1.780693 4.887737 2.344482 0.000000 0.000000 ... 2.423846 1.633036 1.635920 1.488109 0.000000 0.764985 C+10 -5.099213 -7.074005 C+10_02_UMAP.csv
228987 41605 6.670011 0.000000 4.687460 4.526303 6.770259 0.000000 1.226518 0.000000 0.000000 ... 0.605792 0.000000 2.066458 7.181400 4.065031 1.793569 C+10 -1.946411 9.140014 C+10_02_UMAP.csv
228988 41606 0.000000 1.012880 3.522446 0.000000 0.569018 5.137049 1.232675 1.223021 0.104393 ... 2.613971 0.158086 2.283521 0.000000 0.002910 2.368846 C+10 0.596668 -3.941322 C+10_02_UMAP.csv
228989 41607 0.000000 0.000000 5.118576 0.145364 0.390451 0.000000 0.207504 0.000000 0.000000 ... 0.638453 0.152510 1.732544 0.650463 5.139588 0.000000 C+10 5.286784 8.954355 C+10_02_UMAP.csv
228990 41608 5.810825 0.000000 3.944827 4.633459 6.281823 4.475276 2.100314 1.374925 0.000000 ... 0.000000 0.000000 0.831452 6.011548 2.111776 1.356965 C+10 -1.420401 8.936044 C+10_02_UMAP.csv
228991 41609 7.090389 0.000000 4.271693 4.200678 6.365301 0.891432 0.000000 0.000000 0.000000 ... 0.453684 1.099886 1.039593 7.936172 2.986735 3.510896 C+10 -2.318273 10.328561 C+10_02_UMAP.csv
228992 41610 0.000000 0.000000 3.524437 3.280108 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.388075 0.045577 0.392367 3.090400 3.232912 0.000000 C+10 6.250589 7.120960 C+10_02_UMAP.csv
228993 41611 0.000000 0.000000 3.307719 1.808582 0.240535 5.359190 2.092781 0.000000 0.850915 ... 3.674670 0.428482 0.000000 0.000000 1.354325 0.000000 C+10 -2.575009 -6.680745 C+10_02_UMAP.csv
228994 41612 0.000000 0.000000 4.209272 1.645129 0.000000 5.009252 0.716818 1.739552 0.000000 ... 1.917916 0.000000 1.801387 2.052170 0.000000 0.210475 C+10 -3.443430 -7.587960 C+10_02_UMAP.csv
228995 41613 0.000000 0.000000 4.334081 4.399431 0.000000 0.380783 0.000000 0.000000 0.000000 ... 1.102676 1.327407 1.715953 0.000000 5.224417 0.778003 C+10 5.656419 8.901880 C+10_02_UMAP.csv
228996 41614 0.000000 0.000000 4.549118 3.217820 0.000000 5.552540 1.630839 0.000000 0.000000 ... 0.301416 1.359575 0.077738 1.442183 0.000000 0.620776 C+10 -3.934688 -7.169604 C+10_02_UMAP.csv
228997 41615 0.833054 0.000000 4.541667 2.894643 1.931292 5.004816 0.417972 0.000000 0.000000 ... 1.783129 0.472331 0.891427 1.425946 0.878771 0.000000 C+10 -4.646605 -7.647383 C+10_02_UMAP.csv
228998 41616 0.000000 0.000000 3.246357 2.744779 1.933608 4.901257 1.985625 0.000000 0.000000 ... 1.211386 0.800042 0.332599 1.277993 0.000000 0.000000 C+10 -5.341502 -7.287535 C+10_02_UMAP.csv
228999 41617 6.616857 1.321242 4.583064 4.580763 6.852770 1.233149 0.709707 1.386395 0.000000 ... 0.642968 0.000000 0.215956 7.517579 1.641727 0.328797 C+10 -2.075478 10.205151 C+10_02_UMAP.csv
229000 41618 4.306097 0.000000 3.815210 4.471703 5.827122 3.157962 0.859388 0.000000 0.000000 ... 0.000000 0.000000 0.335209 6.053554 3.004125 1.567613 C+10 -1.376538 8.799809 C+10_02_UMAP.csv
229001 41619 0.000000 0.000000 3.233580 3.190267 0.000000 5.164777 1.837170 0.376646 0.000000 ... 0.000000 1.746832 0.000000 0.000000 0.538482 2.169316 C+10 -4.772773 -7.267339 C+10_02_UMAP.csv
229002 41620 0.000000 0.016013 4.900650 2.036425 2.064460 5.319529 2.905207 0.676870 1.420269 ... 2.537121 0.000000 0.360131 1.963905 0.000000 0.021070 C+10 1.121436 -5.920226 C+10_02_UMAP.csv
229003 41621 0.000000 2.030388 4.197967 0.727653 2.568631 4.824194 2.398714 0.662993 0.000000 ... 0.000000 1.621251 0.000000 2.204221 1.807120 0.000000 C+10 3.387814 -3.811976 C+10_02_UMAP.csv

20 rows × 39 columns

In [103]:
# Write `big_boi` back out as one CSV per source file into the 'big_one' folder.
# Each output file is named after the value stored in the 'id' column.
os.chdir('/Users/s1249052/PhD/flow data/vac69a/t cells only/experiment_210618_files/csv_by_person/all/big_one')

# A single groupby pass replaces one full-table boolean scan per file name. Groups
# come out in sorted key order with rows in their original order, i.e. the same
# files and contents as iterating over np.unique(big_boi['id']).
for source, new_df in big_boi.groupby('id'):
    # Drop the bookkeeping columns in one call; the DataFrame index is still
    # written as the first CSV column, exactly as before.
    new_df.drop(columns=['id', 'Unnamed: 0', 'Timepoint']).to_csv(source, sep=',')

os.getcwd()
Out[103]:
'/Users/s1249052/PhD/flow data/vac69a/t cells only/experiment_210618_files/csv_by_person/all/big_one'
In [101]:
# Show the column labels of the combined table.
print(big_boi.columns)
Index(['Unnamed: 0', '115In_CD57', '141Pr_HLA-DR', '142Nd_BCL-2',
       '143Nd_CD45RA', '144Nd_GZB', '145Nd_CD4', '146Nd_Vd2', '148Nd_ICOS',
       '149Sm_CXCR5', '150Nd_CD95', '151Eu_CD103', '153Eu_Va7.2',
       '154Sm_TIM-3', '155Gd_PD1', '156Gd_CD161', '158Gd_CD27', '159Tb_FoxP3',
       '160Gd_CTLA4', '161Dy_Tbet', '162Dy_IntegrinB7', '163Dy_CD28',
       '164Dy_Ki-67', '165Ho_CD45RO', '166Er_CD56', '167Er_CCR7',
       '168Er_CD127', '169Tm_CD38', '171Yb_CD49d', '172Yb_CD25', '173Yb_CD39',
       '174Yb_CLA', '175Lu_Perforin', '198Pt_CD8', '209Bi_CD16', 'Timepoint',
       'umap1', 'umap2', 'id'],
      dtype='object')
In [102]:
# Split `sub_concat` into one frame per 'Timepoint' label and drop the 'Timepoint'
# column from each piece.
# NOTE(review): `sub_concat` is not assigned in any visible cell and the recorded
# run of this cell stopped with a NameError - define it (presumably one person's
# concatenated table; confirm) before running, or retire this cell.
_per_timepoint = {
    label: sub_concat[sub_concat['Timepoint'] == label].drop(columns=['Timepoint'])
    for label in ('C-1', 'C+10', 'C+12', 'DoD', 'T+6')
}

Baseline_csv = _per_timepoint['C-1']  # the frame named "Baseline" is the 'C-1' label
C_10_csv = _per_timepoint['C+10']
C_12_csv = _per_timepoint['C+12']
DoD_csv = _per_timepoint['DoD']
T_6_csv = _per_timepoint['T+6']
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-102-68146b351f94> in <module>()
----> 1 Baseline_csv = sub_concat[sub_concat['Timepoint'] == 'C-1']
      2 C_10_csv = sub_concat[sub_concat['Timepoint'] == 'C+10']
      3 C_12_csv = sub_concat[sub_concat['Timepoint'] == 'C+12']
      4 DoD_csv = sub_concat[sub_concat['Timepoint'] == 'DoD']
      5 T_6_csv = sub_concat[sub_concat['Timepoint'] == 'T+6']

NameError: name 'sub_concat' is not defined
In [ ]:
# Save each per-timepoint frame to a CSV in the current working directory (the
# file names are hard-coded with the '_09_' tag), then show that directory.
exports = (
    (Baseline_csv, 'baseline_09_UMAP.csv'),
    (C_10_csv, 'C+10_09_UMAP.csv'),
    (C_12_csv, 'C+12_09_UMAP.csv'),
    (DoD_csv, 'DoD_09_UMAP.csv'),
    (T_6_csv, 'T+6_09_UMAP.csv'),
)
for frame, target in exports:
    frame.to_csv(target, sep=',')

os.getcwd()